library(tidyverse)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------------------------------------------------------
as.difftime(): lubridate, base
date():        lubridate, base
filter():      dplyr, stats
intersect():   lubridate, base
lag():         dplyr, stats
setdiff():     lubridate, base
union():       lubridate, base
library(stringr)
library(anytime)
library(lubridate)
library(skimr)
library(listviewer)
library(rlang)

Attaching package: ‘rlang’

The following objects are masked from ‘package:purrr’:

    %||%, %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl, invoke, is_atomic,
    is_bare_atomic, is_bare_character, is_bare_double, is_bare_integer, is_bare_list, is_bare_logical,
    is_bare_numeric, is_bare_vector, is_character, is_double, is_empty, is_formula, is_function, is_integer,
    is_list, is_logical, is_null, is_scalar_atomic, is_scalar_character, is_scalar_double, is_scalar_integer,
    is_scalar_list, is_scalar_logical, is_scalar_vector, is_vector, list_along, prepend, rep_along, set_names,
    splice

The following object is masked from ‘package:tibble’:

    has_name
library(forcats)
library(ggjoy)
library(padr)
library(hrbrthemes)
hrbrthemes is under *active* development. See https://github.com/hrbrmstr/hrbrthemes for info/news.
library(janitor)
load("output/trennid_toodeldud.RData")

Tutvu andmetega

glimpse(trennid_toodeldud)
Observations: 75,808
Variables: 14
$ profile          <dbl> 10005837, 10005837, 10005837, 10005837, 10005837, 10005837, 10005837, 10005837, 10005837, 10005837...
$ synnipaev        <date> 1978-06-26, 1978-06-26, 1978-06-26, 1978-06-26, 1978-06-26, 1978-06-26, 1978-06-26, 1978-06-26, 1...
$ sugu             <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male", "M...
$ pikkus_cm        <dbl> 195, 195, 195, 195, 195, 195, 195, 195, 195, 195, 195, 195, 195, 180, 180, 169, 169, 169, 169, 169...
$ kaal_kg          <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ lemmik_spordiala <chr> "Cycling (Sport)", "Cycling (Sport)", "Cycling (Sport)", "Cycling (Sport)", "Cycling (Sport)", "Cy...
$ workout          <dbl> 519605661, 349744705, 345915031, 345404462, 344247597, 341731612, 340579183, 340106467, 339578378,...
$ workout_date     <dttm> 2017-05-10 12:35:00, 2017-06-01 14:41:00, 2017-05-25 14:42:00, 2017-05-24 16:10:00, 2017-05-22 16...
$ spordiala        <chr> "riding", "cycling", "cycling", "cycling", "cycling", "cycling", "doing weight training", "cycling...
$ trenni_kestvus   <S4: Period> 59S, 1H 10M 22S, 1H 12M 16S, 59M 46S, 1H 13M 22S, 1H 7M 0S, 1H 29M 22S, 1H 9M 54S, 1H 14M 0...
$ distants_km      <dbl> NA, 27.20, 27.36, 21.29, 30.07, 26.20, NA, 26.58, 27.16, 23.22, 6.55, 1.06, 2.84, NA, NA, NA, 4.01...
$ raw_profile_info <chr> "\n                Country:\n                        Estonia\n                    Birthday:\n     ...
$ raw_workout_date <chr> "May 10 at 11:35", "June 01 at 13:41", "May 25 at 13:42", "May 24 at 15:10", "May 22 at 15:58", "M...
$ raw_workout_info <chr> "was out riding for 0m:59s.", "was out cycling. He tracked 27.20 km in 1h:10m:22s.", "was out cycl...

Ülevaade kõigist veergudest

# Funktsioon leiab valitud veeru kõik unikaalsed väärtused koos esinemissagedusega
unikaalsed <- function(x){
    valik <- sym(x)
    
    p <- trennid_toodeldud %>% 
        mutate(uus = str_replace_na(!!valik)) %>% 
        count(uus) %>% 
        mutate(uus = str_c(uus, " (", n, ")"),
               uus = fct_reorder(uus, n)) %>% 
        arrange(desc(n)) %>% 
        select(uus)
    
    colnames(p) <- x
    
    return(p)
}
# kõik veerud, mis pole ID või muul põhjusel välistatud
veerud <- trennid_toodeldud %>% 
    select(-profile, -workout, -contains("raw")) %>%
    colnames()
# leia data frame kõigi veergude unikaalsed väärtused
p <- map(veerud, unikaalsed)
# kuva unikaalsed väärtused
p %>% 
    map(., as.list) %>% 
    flatten() %>% 
    jsonedit()

TOP 15 populaarsemat spordiala

top_15_spordiala <- trennid_toodeldud %>% 
  filter(spordiala != "exercising") %>%  # välista mittemidagiütlev "exercising"
  count(spordiala, sort = TRUE) %>% 
  head(15) %>% 
  .$spordiala
top_15_spordiala
 [1] "walking"                "running"                "cycling"                "mountain biking"       
 [5] "skating"                "doing weight training"  "skiing"                 "hiking"                
 [9] "doing aerobics"         "doing circuit training" "swimming"               "orienteering"          
[13] "roller skiing"          "golfing"                "dancing"               

TOP 10 lemmik spordiala

top_10_lemmik_spordiala <- trennid_toodeldud %>% 
  distinct(profile, lemmik_spordiala) %>% 
  mutate(lemmik_spordiala = case_when(.$lemmik_spordiala == "Walking (Fitness)" ~ "walking",
                                      .$lemmik_spordiala == "Skiing (Cross country)" ~ "skiing",
                                      .$lemmik_spordiala == "Football (Soccer)" ~ "soccer",
                                      TRUE ~ .$lemmik_spordiala),
         lemmik_spordiala = str_to_lower(lemmik_spordiala)) %>% 
  count(lemmik_spordiala, sort = TRUE) %>% 
  head(10) %>% 
  .$lemmik_spordiala
top_10_lemmik_spordiala
 [1] "walking"             "running"             "cycling (sport)"     "cycling (transport)" "mountain biking"    
 [6] "roller skating"      "weight training"     "skiing"              "orienteering"        "soccer"             

Populaarsemad spordialad eesti keeles

spordiala_est <- tribble(
  ~spordiala, ~spordiala_est,
  "walking", "käimine",
  "running", "jooksmine",
  "cycling", "jalgrattasõit",
  "cycling (sport)", "jalgrattasõit (sport)",
  "cycling (transport)", "jalgrattasõit (transport)",
  "mountain biking", "maastikuratta sõit",
  "skating", "rulluisutamine",
  "roller skating", "rulluisutamine",
  "doing weight training", "jõutrenn",  
  "weight training", "jõutrenn",  
  "skiing", "suusatamine",  
  "hiking", "matkamine",  
  "dancing", "tantsimine",  
  "doing aerobics", "aeroobika",
  "doing circuit training", "ringtreening",
  "swimming", "ujumine",  
  "orienteering", "orienteerumine",  
  "roller skiing", "rullsuusatamine",  
  "golfing", "golf",
  "soccer", "jalgpall"
)

Ülevaade analüüsis kasutatud trennide ja inimeste arvust

trennid_toodeldud %>% 
  filter(!is.na(spordiala)) %>% 
  group_by(spordiala) %>% 
  summarise(trennide_arv = n(),
            kasutajate_arv = n_distinct(profile)) %>% 
  ungroup() %>% 
  mutate(label = str_c(trennide_arv, " (", kasutajate_arv, ")")) %>% 
  arrange(desc(trennide_arv)) %>% 
  head(30) %>% 
  ggplot(aes(fct_reorder(spordiala, trennide_arv), trennide_arv)) +
  geom_col() +
  geom_text(aes(label = label), hjust = -0.1) +
  scale_y_continuous(breaks = (seq(0, 35000, by = 10000)),
                     limits = c(0, 35000),
                     expand = c(0, 0)) +
  coord_flip() +
  theme_ipsum_rc() +
  labs(y = "trennide arv",
       x = "spordiala",
       title = "Trennide ja treenijate arv kokku",
       subtitle = "TOP 30 spordiala\nSulgudes on spordialaga tegelenud inimeste arv")

Mis kell tehakse trenni nädalavahetusel?

top_trennid_nadalavahetusel <- trennid_toodeldud %>% 
  filter(spordiala %in% top_15_spordiala) %>%  # ainult top 15 alad
  left_join(spordiala_est) %>% 
  filter(!is.na(workout_date)) %>%  # välista ilma kellaajata trennid
  mutate(nadalapaev = weekdays(workout_date)) %>%  # leia trenni nädalapäev
  filter(nadalapaev %in% c("Saturday", "Sunday")) %>%  # ainult nädalavahetused
  thicken(by = "workout_date",  interval = "15 mins") %>%  # ümarda kõik kellaajad 15 min täpsusega
  # asenda kõik kuupäevad 01.01.2017
  # nii on lihtsam andmed ühele skaalale viia
  mutate(aeg = ymd_hms(str_replace(workout_date_15_min, "201.-..-..", "2017-01-01"))) %>% 
  select(spordiala_est, aeg) %>% 
  group_by(spordiala_est) %>%
  mutate(n = n()) %>% 
  filter(aeg > ymd_hms("2017-01-01 06:00:00")) %>%  # ainult pärast kl 6 tehtud trennid
  ungroup() %>% 
  mutate(n_total = n(),  # kogu trenni arvu kuvamiseks
         spordiala_est = fct_reorder(spordiala_est, n)) %>% 
  arrange(spordiala_est, aeg)
Joining, by = "spordiala"
Datetime variable was unsorted, result will be unsorted as well.
top_trennid_nadalavahetusel %>% 
  ggplot(aes(aeg, spordiala_est)) +
  geom_joy(aes(fill = spordiala_est), scale = 2, colour = "white", size = 0.7) +
  theme_ipsum_rc() +
  scale_y_discrete(expand = c(0.01, 0)) +
  # formati x-teljel kellaaeg
  scale_x_datetime(labels = function(x) format(x, "%H:%M"),
                   breaks = seq(ymd_hms("2017-01-01 06:00:00"),
                                ymd_hms("2017-01-01 24:00:00"), "3 hours"),
                   expand = c(0, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +  # vahelduvad värvitoonid
  labs(x = "kellaaeg",
       y = "spordiala",
       title = "Mis kell tehakse trenni nädalavahetusel?",
       subtitle = str_c("ca ", round(max(top_trennid_nadalavahetusel$n_total, na.rm = TRUE) / 1000, 0),
                        " 000 Endomondos logitud trenni põhjal"))

Mis kell tehakse trenni tööpäevadel?

top_trennid_toopaevadel <- trennid_toodeldud %>% 
  filter(spordiala %in% top_15_spordiala) %>% 
  left_join(spordiala_est) %>% 
  filter(!is.na(workout_date)) %>% 
  mutate(nadalapaev = weekdays(workout_date)) %>% 
  filter(!nadalapaev %in% c("Saturday", "Sunday")) %>% 
  thicken(by = "workout_date",  interval = "15 mins") %>% 
  mutate(aeg = ymd_hms(str_replace(workout_date_15_min, "201.-..-..", "2017-01-01"))) %>% 
  select(spordiala_est, aeg, profile) %>% 
  group_by(spordiala_est) %>%
  mutate(n = n()) %>% 
  select(-profile) %>% 
  filter(aeg > ymd_hms("2017-01-01 06:00:00")) %>% 
  ungroup() %>% 
  mutate(n_total = n(),
         spordiala_est = fct_reorder(spordiala_est, n)) %>% 
  arrange(spordiala_est, aeg)
Joining, by = "spordiala"
Datetime variable was unsorted, result will be unsorted as well.
top_trennid_toopaevadel %>% 
  ggplot(aes(aeg, spordiala_est)) +
  geom_joy(aes(fill = spordiala_est), scale = 2, colour = "white", size = 0.7) +
  # theme_joy() +
  theme_ipsum_rc() +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_datetime(labels = function(x) format(x, "%H:%M"),
                   breaks = seq(ymd_hms("2017-01-01 06:00:00"),
                                ymd_hms("2017-01-01 24:00:00"), "3 hours"),
                   expand = c(0, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  labs(x = "kellaaeg",
       y = "spordiala",
       title = "Mis kell tehakse trenni tööpäevadel?",
       subtitle = str_c("ca ", round(max(top_trennid_toopaevadel$n_total, na.rm = TRUE) / 1000, 0),
                        " 000 Endomondos logitud trenni põhjal"))

Mis nädalapäevadel mingit spordiala harrastatakse?

top_trennid_paevade_loikes <- trennid_toodeldud %>% 
  filter(spordiala %in% top_15_spordiala) %>% 
  left_join(spordiala_est) %>% 
  filter(!is.na(workout_date)) %>% 
  # nädalapäeva number (1 = esmaspäev)
  mutate(nadalapaev = wday(workout_date, week_start = 1)) %>% 
  select(spordiala_est, nadalapaev, profile) %>% 
  group_by(spordiala_est) %>%
  mutate(n = n()) %>% 
  select(-profile) %>% 
  ungroup() %>% 
  mutate(n_total = n(),
         spordiala_est = fct_reorder(spordiala_est, n)) %>% 
  arrange(spordiala_est, nadalapaev)
Joining, by = "spordiala"
top_trennid_paevade_loikes %>% 
  ggplot(aes(nadalapaev, spordiala_est)) +
  geom_joy(aes(fill = spordiala_est), scale = 2, colour = "white", size = 0.7) +
  geom_vline(xintercept = 5.5) +
  annotate("text", x = 6.5, y = 16.5, label = "nädalavahetuse") +
  theme_ipsum_rc() +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(breaks = seq(1, 7, by = 1),
                     labels = c("esmasp", "teisip", "kolmap", "neljap", "reede", "laup", "pühap"),
                     expand = c(0, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  labs(x = "päev",
       y = "spordiala",
       title = "Mis päeval spordiala harrastatakse?",
       subtitle = str_c("ca ", round(max(top_trennid_paevade_loikes$n_total, na.rm = TRUE) / 1000, 0),
                        " 000 Endomondos logitud trenni põhjal"))

Kuidas jagunevad spordialad trenni kestvuse lõikes?

top_trennid_kestvus <- trennid_toodeldud %>% 
  # arvuta iga trenni pikkus minutites
  mutate(kestvus_minutites = as.numeric(as.duration(trenni_kestvus)) / 60) %>% 
  filter(spordiala %in% top_15_spordiala) %>% 
  left_join(spordiala_est) %>% 
  filter(!is.na(workout_date),
         kestvus_minutites <= 120) %>%  # max 2 H pikad trennid 
  select(spordiala_est, kestvus_minutites, profile) %>% 
  group_by(spordiala_est) %>%
  mutate(n = n()) %>% 
  select(-profile) %>% 
  ungroup() %>% 
  mutate(n_total = n(),
         spordiala_est = fct_reorder(spordiala_est, n)) %>% 
  arrange(spordiala_est, kestvus_minutites)
longer object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthJoining, by = "spordiala"
top_trennid_kestvus %>% 
  ggplot(aes(kestvus_minutites, spordiala_est)) +
  geom_joy(aes(fill = spordiala_est), scale = 2, colour = "white", size = 0.7) +
  theme_ipsum_rc() +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(breaks = seq(0, 120, by = 30),
                     expand = c(0, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  labs(x = "kestvus, min",
       y = "spordiala",
       title = "Kui pikalt trenni tehakse?",
       subtitle = str_c("ca ", round(max(top_trennid_kestvus$n_total, na.rm = TRUE) / 1000, 0),
                        " 000 Endomondos logitud trenni põhjal"))

Kuidas jagunevad spordialade lõikes inimeste vanus?

top_trennid_vanus <- trennid_toodeldud %>% 
  filter(spordiala %in% top_15_spordiala) %>% 
  left_join(spordiala_est) %>% 
  mutate(vanus_praegu = round(as.numeric(difftime(today(), synnipaev, units = "days")) / 365, 0)) %>% 
  filter(!is.na(synnipaev), 
         vanus_praegu >= 10,
         vanus_praegu <= 70) %>%
  select(spordiala_est, vanus_praegu, profile) %>% 
  group_by(spordiala_est) %>%
  mutate(n = n()) %>% 
  distinct(profile, spordiala_est, .keep_all = TRUE) %>% 
  ungroup() %>% 
  mutate(n_total = n_distinct(profile),
         spordiala_est = fct_reorder(spordiala_est, n)) %>% 
  select(-profile) %>% 
  arrange(spordiala_est, vanus_praegu)
Joining, by = "spordiala"
top_trennid_vanus %>% 
  ggplot(aes(vanus_praegu, spordiala_est)) +
  geom_joy(aes(fill = spordiala_est), scale = 2, colour = "white", size = 0.7) +
  theme_ipsum_rc() +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(breaks = seq(10, 70, by = 10),
                     limits = c(10, 70),
                     expand = c(0, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  labs(x = "vanus",
       y = "spordiala",
       title = "Mis vanuses trenni tehakse?",
       subtitle = str_c("ca ", round(max(top_trennid_vanus$n_total, na.rm = TRUE) / 1000, 0),
                        " 000 Endomondo konto andmetel"))

Vanus ja lemmik spordiala.

top_trennid_lemmik_spordiala_vanus <- trennid_toodeldud %>% 
  # grupeeri mõned spordialad kokku
  mutate(lemmik_spordiala = case_when(.$lemmik_spordiala == "Walking (Fitness)" ~ "walking",
                                      .$lemmik_spordiala == "Skiing (Cross country)" ~ "skiing",
                                      .$lemmik_spordiala == "Football (Soccer)" ~ "soccer",
                                      TRUE ~ .$lemmik_spordiala),
         lemmik_spordiala = str_to_lower(lemmik_spordiala)) %>% 
  filter(lemmik_spordiala %in% top_10_lemmik_spordiala) %>% 
  left_join(spordiala_est, by = c("lemmik_spordiala" = "spordiala")) %>% 
  mutate(vanus_praegu = round(as.numeric(difftime(today(), synnipaev, units = "days")) / 365, 0)) %>% 
  filter(!is.na(synnipaev), 
         vanus_praegu >= 10,
         vanus_praegu <= 70,
         !is.na(lemmik_spordiala)) %>%
  select(lemmik_spordiala_est = spordiala_est, vanus_praegu, profile) %>% 
  group_by(lemmik_spordiala_est) %>%
  mutate(n = n()) %>% 
  # iga kasutaja ühekordselt
  distinct(profile, lemmik_spordiala_est, .keep_all = TRUE) %>% 
  ungroup() %>% 
  mutate(n_total = n_distinct(profile),
         lemmik_spordiala_est = fct_reorder(lemmik_spordiala_est, n)) %>% 
  select(-profile) %>% 
  arrange(lemmik_spordiala_est, vanus_praegu)
top_trennid_lemmik_spordiala_vanus %>% 
  ggplot(aes(vanus_praegu, lemmik_spordiala_est)) +
  geom_joy(aes(fill = lemmik_spordiala_est), scale = 2, colour = "white", size = 0.7) +
  theme_ipsum_rc() +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_x_continuous(breaks = seq(10, 70, by = 10),
                     limits = c(10, 70),
                     expand = c(0, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  labs(x = "vanus",
       y = "lemmik spordiala",
       title = "Lemmik spordiala vs vanus",
       subtitle = str_c("ca ", round(max(top_trennid_lemmik_spordiala_vanus$n_total, na.rm = TRUE) / 1000, 0),
                        " 000 Endomondo konto andmetel"))

Kuidas mõjutab vanus mediaan keskmist jooksu kiirust?

trennid_toodeldud %>% 
  ungroup() %>% 
  mutate(kestvus_minutites = as.numeric(as.duration(trenni_kestvus)) / 60) %>% 
  filter(!is.na(synnipaev), 
         spordiala == "running", 
         !is.na(trenni_kestvus),
         !is.na(sugu)) %>% 
  mutate(vanus_praegu = as.numeric(difftime(today(), synnipaev, units = "days")) / 365,
         kiirus = kestvus_minutites / distants_km) %>% 
  # filter(kiirus < 15, kiirus > 1) %>%
  group_by(profile, vanus_praegu, sugu) %>%
  summarise(kiirus_median = median(kiirus, na.rm = TRUE),
            n = n()) %>% 
  filter(n >= 10, kiirus_median <= 10, kiirus_median >= 3) %>% 
  ungroup() %>% 
  ggplot(aes(vanus_praegu, kiirus_median, colour = sugu)) +
  geom_point() +
  theme_ipsum_rc() +
  labs(x = "vanus",
       y = "tempo min/km",
       title = "Keskmise jooksu tempo ja vanuse seos",
       subtitle = "Iga punkt tähistab ühe inimese mediaan tempot")
longer object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object length

Milline on vanuse ja mediaan keskmine jooksu distantsi vaheline seos?

trennid_toodeldud %>% 
  ungroup() %>% 
  filter(!is.na(synnipaev), 
         spordiala == "running", 
         !is.na(distants_km),
         !is.na(sugu)) %>% 
  mutate(vanus_praegu = as.numeric(difftime(today(), synnipaev, units = "days")) / 365) %>% 
  group_by(profile, vanus_praegu, sugu) %>%
  summarise(distance_median = median(distants_km, na.rm = TRUE),
            n = n()) %>% 
  filter(n >= 10, vanus_praegu >= 10, distance_median <= 15) %>% 
  ungroup() %>% 
  ggplot(aes(vanus_praegu, distance_median, colour = sugu)) +
  geom_point() +
  theme_ipsum_rc() +
  labs(x = "vanus",
       y = "distants, km",
       title = "Keskmise jooksu distantsi ja vanuse seos",
       subtitle = "Iga punkt tähistab ühe inimese mediaan distantsi")

---
title: "Visualiseeri trenni andmeid"
output: html_notebook
---


```{r}
library(tidyverse)
library(stringr)
library(anytime)
library(lubridate)
library(skimr)
library(listviewer)
library(rlang)
library(forcats)
library(ggjoy)
library(padr)
library(hrbrthemes)
library(janitor)

# Restore the saved workspace objects from the .RData file. The chunks below
# read a data frame called `trennid_toodeldud`, which is not created anywhere
# in this notebook -- presumably it comes from this file (TODO confirm).
load("output/trennid_toodeldud.RData")
```

### Tutvu andmetega
```{r}
# Print a compact overview of the data: one line per column with its type
# and the first few values.
glimpse(trennid_toodeldud)
```

Ülevaade kõigist veergudest
```{r}
# Tabulate the distinct values of one column together with their frequencies.
#
# Args:
#   x:    Name of the column to summarise, given as a single string.
#   data: Data frame to summarise. Defaults to the global `trennid_toodeldud`,
#         so existing one-argument calls keep working unchanged.
#
# Returns:
#   A one-column tibble whose column is named after `x`. Every row is a factor
#   value of the form "value (count)"; rows are sorted from the most to the
#   least frequent value and missing values appear as the string "NA".
unikaalsed <- function(x, data = trennid_toodeldud) {
    # Fail early with a clear message instead of a tidy-eval error deep in mutate()
    stopifnot(is.character(x), length(x) == 1, x %in% colnames(data))

    valik <- sym(x)

    p <- data %>%
        # str_replace_na() turns NA into the string "NA" so that missing
        # values get a row of their own in the tabulation
        mutate(uus = str_replace_na(!!valik)) %>%
        count(uus) %>%
        mutate(uus = str_c(uus, " (", n, ")"),
               uus = fct_reorder(uus, n)) %>%
        arrange(desc(n)) %>%
        select(uus)

    # Name the single result column after the summarised column
    colnames(p) <- x

    p
}

# Columns to summarise: everything except the ID columns (`profile`,
# `workout`) and the columns whose name contains "raw"
veerud <- colnames(
    select(trennid_toodeldud, -profile, -workout, -contains("raw"))
)

# One frequency table per selected column
p <- map(veerud, unikaalsed)

# Turn every table into a plain list, merge them into one flat list
# and show the result in the interactive JSON viewer
map(p, as.list) %>%
    flatten() %>%
    jsonedit()
```

TOP 15 populaarsemat spordiala
```{r}
# Names of the 15 most frequently logged sports, most frequent first.
# The uninformative catch-all "exercising" is left out.
top_15_spordiala <- trennid_toodeldud %>%
  filter(spordiala != "exercising") %>%
  count(spordiala, sort = TRUE) %>%
  head(n = 15) %>%
  pull(spordiala)

top_15_spordiala
```

TOP 10 lemmik spordiala
```{r}
# The 10 favourite sports chosen by the most profiles, most popular first.
# Each profile is counted once per favourite sport; three labels are shortened
# and everything is lower-cased before counting.
top_10_lemmik_spordiala <- trennid_toodeldud %>%
  distinct(profile, lemmik_spordiala) %>%
  mutate(
    lemmik_spordiala = str_to_lower(
      case_when(
        lemmik_spordiala == "Walking (Fitness)" ~ "walking",
        lemmik_spordiala == "Skiing (Cross country)" ~ "skiing",
        lemmik_spordiala == "Football (Soccer)" ~ "soccer",
        TRUE ~ lemmik_spordiala
      )
    )
  ) %>%
  count(lemmik_spordiala, sort = TRUE) %>%
  head(n = 10) %>%
  pull(lemmik_spordiala)

top_10_lemmik_spordiala
```

Populaarsemad spordialad eesti keeles
```{r}
# Lookup table from the English sport label (`spordiala`) to the Estonian
# label (`spordiala_est`) shown in the plots. It holds both the workout labels
# (e.g. "doing weight training") and the lower-cased favourite-sport labels
# (e.g. "weight training"); several English labels share one Estonian label.
spordiala_est <- tribble(
  ~spordiala,               ~spordiala_est,
  "walking",                "käimine",
  "running",                "jooksmine",
  "cycling",                "jalgrattasõit",
  "cycling (sport)",        "jalgrattasõit (sport)",
  "cycling (transport)",    "jalgrattasõit (transport)",
  "mountain biking",        "maastikuratta sõit",
  "skating",                "rulluisutamine",
  "roller skating",         "rulluisutamine",
  "doing weight training",  "jõutrenn",
  "weight training",        "jõutrenn",
  "skiing",                 "suusatamine",
  "hiking",                 "matkamine",
  "dancing",                "tantsimine",
  "doing aerobics",         "aeroobika",
  "doing circuit training", "ringtreening",
  "swimming",               "ujumine",
  "orienteering",           "orienteerumine",
  "roller skiing",          "rullsuusatamine",
  "golfing",                "golf",
  "soccer",                 "jalgpall"
)
```


Ülevaade analüüsis kasutatud trennide ja inimeste arvust
```{r, fig.height=8, fig.width=8}
# Horizontal bar chart of the 30 sports with the most logged workouts.
# Each bar is labelled "workouts (distinct users)".
trennid_toodeldud %>%
  filter(!is.na(spordiala)) %>%
  group_by(spordiala) %>%
  summarise(trennide_arv = n(),
            kasutajate_arv = n_distinct(profile)) %>%
  ungroup() %>%
  arrange(desc(trennide_arv)) %>%
  head(n = 30) %>%
  mutate(
    label = str_c(trennide_arv, " (", kasutajate_arv, ")"),
    # order the bars by workout count
    spordiala = fct_reorder(spordiala, trennide_arv)
  ) %>%
  ggplot(aes(x = spordiala, y = trennide_arv)) +
  geom_col() +
  # hjust < 0 pushes the label just past the end of the bar
  geom_text(aes(label = label), hjust = -0.1) +
  coord_flip() +
  scale_y_continuous(
    breaks = seq(0, 30000, by = 10000),
    limits = c(0, 35000),
    expand = c(0, 0)
  ) +
  labs(
    title = "Trennide ja treenijate arv kokku",
    subtitle = "TOP 30 spordiala\nSulgudes on spordialaga tegelenud inimeste arv",
    x = "spordiala",
    y = "trennide arv"
  ) +
  theme_ipsum_rc()

```



Mis kell tehakse trenni nädalavahetusel?
```{r, fig.width = 8, fig.height = 8}
# Time-of-day distribution of weekend workouts for the 15 most popular sports.
#
# Result columns:
#   spordiala_est  Estonian sport label, a factor ordered by `n`
#   aeg            workout start in 15-minute bins, moved onto 2017-01-01 so
#                  that all workouts share one time axis
#   n              weekend workouts of the sport (counted before the 06:00 cut)
#   n_total        number of rows left after the 06:00 cut
top_trennid_nadalavahetusel <- trennid_toodeldud %>%
  filter(spordiala %in% top_15_spordiala) %>%  # top 15 sports only
  left_join(spordiala_est, by = "spordiala") %>%
  filter(!is.na(workout_date)) %>%  # drop workouts without a timestamp
  # Weekday as a number (1 = Monday ... 7 = Sunday). Unlike weekdays(), whose
  # day names depend on the session locale, this works under any locale.
  mutate(nadalapaev = wday(workout_date, week_start = 1)) %>%
  filter(nadalapaev >= 6) %>%  # Saturday and Sunday only
  # bin every timestamp into 15-minute intervals
  thicken(by = "workout_date", interval = "15 mins") %>%
  # Keep only the clock time and pin the date to 2017-01-01. An explicit
  # format string avoids relying on implicit date-time -> string coercion and
  # works for any year, not just 2010-2019.
  mutate(aeg = ymd_hms(format(workout_date_15_min, "2017-01-01 %H:%M:%S"))) %>%
  select(spordiala_est, aeg) %>%
  group_by(spordiala_est) %>%
  mutate(n = n()) %>%
  filter(aeg > ymd_hms("2017-01-01 06:00:00")) %>%  # workouts after 06:00 only
  ungroup() %>%
  mutate(n_total = n(),  # total shown in the plot subtitle
         spordiala_est = fct_reorder(spordiala_est, n)) %>%
  arrange(spordiala_est, aeg)


# Ridgeline plot: one density ridge per sport along the time-of-day axis
top_trennid_nadalavahetusel %>%
  ggplot(aes(aeg, spordiala_est)) +
  geom_joy(aes(fill = spordiala_est), scale = 2, colour = "white", size = 0.7) +
  theme_ipsum_rc() +
  scale_y_discrete(expand = c(0.01, 0)) +
  # show only the clock time on the x axis
  scale_x_datetime(labels = function(x) format(x, "%H:%M"),
                   breaks = seq(ymd_hms("2017-01-01 06:00:00"),
                                ymd_hms("2017-01-01 24:00:00"), "3 hours"),
                   expand = c(0, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +  # alternating shades
  labs(x = "kellaaeg",
       y = "spordiala",
       title = "Mis kell tehakse trenni nädalavahetusel?",
       subtitle = str_c("ca ", round(max(top_trennid_nadalavahetusel$n_total, na.rm = TRUE) / 1000, 0),
                        " 000 Endomondos logitud trenni põhjal"))

```

Mis kell tehakse trenni tööpäevadel?
```{r, fig.width = 8, fig.height = 8}
# Time-of-day distribution of working-day workouts for the 15 most popular
# sports.
#
# Result columns:
#   spordiala_est  Estonian sport label, a factor ordered by `n`
#   aeg            workout start in 15-minute bins, moved onto 2017-01-01 so
#                  that all workouts share one time axis
#   n              working-day workouts of the sport (counted before the
#                  06:00 cut)
#   n_total        number of rows left after the 06:00 cut
top_trennid_toopaevadel <- trennid_toodeldud %>%
  filter(spordiala %in% top_15_spordiala) %>%  # top 15 sports only
  left_join(spordiala_est, by = "spordiala") %>%
  filter(!is.na(workout_date)) %>%  # drop workouts without a timestamp
  # Weekday as a number (1 = Monday ... 7 = Sunday). With weekdays() and
  # English day names a non-English locale would let weekend workouts slip
  # through the negated filter; the numeric weekday is locale-independent.
  mutate(nadalapaev = wday(workout_date, week_start = 1)) %>%
  filter(nadalapaev <= 5) %>%  # Monday to Friday only
  # bin every timestamp into 15-minute intervals
  thicken(by = "workout_date", interval = "15 mins") %>%
  # Keep only the clock time and pin the date to 2017-01-01. An explicit
  # format string avoids relying on implicit date-time -> string coercion and
  # works for any year, not just 2010-2019.
  mutate(aeg = ymd_hms(format(workout_date_15_min, "2017-01-01 %H:%M:%S"))) %>%
  select(spordiala_est, aeg) %>%
  group_by(spordiala_est) %>%
  mutate(n = n()) %>%
  filter(aeg > ymd_hms("2017-01-01 06:00:00")) %>%  # workouts after 06:00 only
  ungroup() %>%
  mutate(n_total = n(),  # total shown in the plot subtitle
         spordiala_est = fct_reorder(spordiala_est, n)) %>%
  arrange(spordiala_est, aeg)


# Ridgeline plot: one density ridge per sport along the time-of-day axis
top_trennid_toopaevadel %>%
  ggplot(aes(aeg, spordiala_est)) +
  geom_joy(aes(fill = spordiala_est), scale = 2, colour = "white", size = 0.7) +
  theme_ipsum_rc() +
  scale_y_discrete(expand = c(0.01, 0)) +
  # show only the clock time on the x axis
  scale_x_datetime(labels = function(x) format(x, "%H:%M"),
                   breaks = seq(ymd_hms("2017-01-01 06:00:00"),
                                ymd_hms("2017-01-01 24:00:00"), "3 hours"),
                   expand = c(0, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +  # alternating shades
  labs(x = "kellaaeg",
       y = "spordiala",
       title = "Mis kell tehakse trenni tööpäevadel?",
       subtitle = str_c("ca ", round(max(top_trennid_toopaevadel$n_total, na.rm = TRUE) / 1000, 0),
                        " 000 Endomondos logitud trenni põhjal"))

```

Mis nädalapäevadel mingit spordiala harrastatakse?
```{r, fig.width = 8, fig.height = 8}
# Weekday distribution of workouts for the 15 most popular sports.
# One row per workout: Estonian sport label, weekday number, workouts of that
# sport (`n`) and the overall row count (`n_total`).
top_trennid_paevade_loikes <- trennid_toodeldud %>%
  filter(spordiala %in% top_15_spordiala,
         !is.na(workout_date)) %>%
  left_join(spordiala_est) %>%
  # weekday number, 1 = Monday
  transmute(spordiala_est,
            nadalapaev = wday(workout_date, week_start = 1)) %>%
  add_count(spordiala_est) %>%
  mutate(n_total = n(),
         spordiala_est = fct_reorder(spordiala_est, n)) %>%
  arrange(spordiala_est, nadalapaev)


# Ridgeline plot with a vertical line between Friday and Saturday
ggplot(top_trennid_paevade_loikes,
       aes(x = nadalapaev, y = spordiala_est)) +
  geom_joy(aes(fill = spordiala_est),
           scale = 2, colour = "white", size = 0.7) +
  geom_vline(xintercept = 5.5) +
  annotate("text", x = 6.5, y = 16.5, label = "nädalavahetuse") +
  scale_x_continuous(
    breaks = 1:7,
    labels = c("esmasp", "teisip", "kolmap", "neljap", "reede", "laup", "pühap"),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  labs(
    title = "Mis päeval spordiala harrastatakse?",
    subtitle = str_c(
      "ca ",
      round(max(top_trennid_paevade_loikes$n_total, na.rm = TRUE) / 1000, 0),
      " 000 Endomondos logitud trenni põhjal"
    ),
    x = "päev",
    y = "spordiala"
  ) +
  theme_ipsum_rc()
```


Kuidas jagunevad spordialad trenni kestvuse lõikes?
```{r, fig.width = 8, fig.height = 8}
# Workout duration distribution for the 15 most popular sports, limited to
# workouts of at most two hours.
top_trennid_kestvus <- trennid_toodeldud %>%
  # Duration in minutes. This is computed before any filtering, exactly as in
  # the original pipeline -- do not move it below the filters.
  mutate(kestvus_minutites = as.numeric(as.duration(trenni_kestvus)) / 60) %>%
  filter(spordiala %in% top_15_spordiala) %>%
  left_join(spordiala_est) %>%
  filter(!is.na(workout_date), kestvus_minutites <= 120) %>%
  select(spordiala_est, kestvus_minutites) %>%
  add_count(spordiala_est) %>%  # n = workouts per sport
  mutate(n_total = n(),
         spordiala_est = fct_reorder(spordiala_est, n)) %>%
  arrange(spordiala_est, kestvus_minutites)


# Ridgeline plot: one density ridge per sport along the duration axis
ggplot(top_trennid_kestvus,
       aes(x = kestvus_minutites, y = spordiala_est)) +
  geom_joy(aes(fill = spordiala_est),
           scale = 2, colour = "white", size = 0.7) +
  scale_x_continuous(breaks = seq(0, 120, by = 30), expand = c(0, 0)) +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  labs(
    title = "Kui pikalt trenni tehakse?",
    subtitle = str_c(
      "ca ",
      round(max(top_trennid_kestvus$n_total, na.rm = TRUE) / 1000, 0),
      " 000 Endomondos logitud trenni põhjal"
    ),
    x = "kestvus, min",
    y = "spordiala"
  ) +
  theme_ipsum_rc()
```


Kuidas jagunevad spordialade lõikes inimeste vanus?
```{r, fig.width = 8, fig.height = 8}
# Age distribution of the people practising each of the 15 most popular
# sports. Every profile appears once per sport; `n` is the number of workouts
# of the sport and is used only to order the ridges.
top_trennid_vanus <- trennid_toodeldud %>%
  filter(spordiala %in% top_15_spordiala) %>%
  left_join(spordiala_est) %>%
  # current age in whole years (days since birth / 365, rounded)
  mutate(
    vanus_praegu = round(
      as.numeric(difftime(today(), synnipaev, units = "days")) / 365
    )
  ) %>%
  filter(!is.na(synnipaev), between(vanus_praegu, 10, 70)) %>%
  select(spordiala_est, vanus_praegu, profile) %>%
  group_by(spordiala_est) %>%
  mutate(n = n()) %>%
  # keep each profile once per sport
  distinct(profile, spordiala_est, .keep_all = TRUE) %>%
  ungroup() %>%
  mutate(n_total = n_distinct(profile),
         spordiala_est = fct_reorder(spordiala_est, n)) %>%
  select(-profile) %>%
  arrange(spordiala_est, vanus_praegu)


# Ridgeline plot: one density ridge per sport along the age axis
ggplot(top_trennid_vanus,
       aes(x = vanus_praegu, y = spordiala_est)) +
  geom_joy(aes(fill = spordiala_est),
           scale = 2, colour = "white", size = 0.7) +
  scale_x_continuous(
    breaks = seq(10, 70, by = 10),
    limits = c(10, 70),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  labs(
    title = "Mis vanuses trenni tehakse?",
    subtitle = str_c(
      "ca ",
      round(max(top_trennid_vanus$n_total, na.rm = TRUE) / 1000, 0),
      " 000 Endomondo konto andmetel"
    ),
    x = "vanus",
    y = "spordiala"
  ) +
  theme_ipsum_rc()
```

Vanus ja lemmik spordiala.
```{r, fig.width = 8, fig.height = 7}
# Age distribution per favourite sport, for the 10 most popular favourite
# sports. Every profile appears once per favourite sport; `n` is the number of
# workout rows behind the sport and is used only to order the ridges.
top_trennid_lemmik_spordiala_vanus <- trennid_toodeldud %>%
  # shorten three labels and lower-case everything, the same way the
  # top-10 list was built
  mutate(
    lemmik_spordiala = str_to_lower(
      case_when(
        lemmik_spordiala == "Walking (Fitness)" ~ "walking",
        lemmik_spordiala == "Skiing (Cross country)" ~ "skiing",
        lemmik_spordiala == "Football (Soccer)" ~ "soccer",
        TRUE ~ lemmik_spordiala
      )
    )
  ) %>%
  filter(lemmik_spordiala %in% top_10_lemmik_spordiala) %>%
  left_join(spordiala_est, by = c("lemmik_spordiala" = "spordiala")) %>%
  # current age in whole years (days since birth / 365, rounded)
  mutate(
    vanus_praegu = round(
      as.numeric(difftime(today(), synnipaev, units = "days")) / 365
    )
  ) %>%
  filter(!is.na(synnipaev),
         between(vanus_praegu, 10, 70),
         !is.na(lemmik_spordiala)) %>%
  select(lemmik_spordiala_est = spordiala_est, vanus_praegu, profile) %>%
  group_by(lemmik_spordiala_est) %>%
  mutate(n = n()) %>%
  # keep each user once
  distinct(profile, lemmik_spordiala_est, .keep_all = TRUE) %>%
  ungroup() %>%
  mutate(n_total = n_distinct(profile),
         lemmik_spordiala_est = fct_reorder(lemmik_spordiala_est, n)) %>%
  select(-profile) %>%
  arrange(lemmik_spordiala_est, vanus_praegu)


# Ridgeline plot: one density ridge per favourite sport along the age axis
ggplot(top_trennid_lemmik_spordiala_vanus,
       aes(x = vanus_praegu, y = lemmik_spordiala_est)) +
  geom_joy(aes(fill = lemmik_spordiala_est),
           scale = 2, colour = "white", size = 0.7) +
  scale_x_continuous(
    breaks = seq(10, 70, by = 10),
    limits = c(10, 70),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = c(0.01, 0)) +
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  labs(
    title = "Lemmik spordiala vs vanus",
    subtitle = str_c(
      "ca ",
      round(max(top_trennid_lemmik_spordiala_vanus$n_total, na.rm = TRUE) / 1000, 0),
      " 000 Endomondo konto andmetel"
    ),
    x = "vanus",
    y = "lemmik spordiala"
  ) +
  theme_ipsum_rc()
```


Kuidas mõjutab vanus mediaan keskmist jooksu kiirust?
```{r}
# Scatter plot of each runner's median pace (minutes per km) against age,
# coloured by sex. Only runners with at least 10 usable runs and a median
# pace between 3 and 10 min/km are shown.
trennid_toodeldud %>%
  ungroup() %>%
  # workout duration in minutes
  mutate(kestvus_minutites = as.numeric(as.duration(trenni_kestvus)) / 60) %>%
  filter(!is.na(synnipaev),
         spordiala == "running",
         !is.na(trenni_kestvus),
         # A pace needs a positive distance. Dropping the other runs here makes
         # `n` below count only runs that contribute to the median, so the
         # `n >= 10` threshold means what it says, and avoids infinite paces.
         !is.na(distants_km),
         distants_km > 0,
         !is.na(sugu)) %>%
  mutate(vanus_praegu = as.numeric(difftime(today(), synnipaev, units = "days")) / 365,
         # despite the name ("speed") this is a pace: minutes per kilometre
         kiirus = kestvus_minutites / distants_km) %>%
  # filter(kiirus < 15, kiirus > 1) %>%
  group_by(profile, vanus_praegu, sugu) %>%
  summarise(kiirus_median = median(kiirus, na.rm = TRUE),
            n = n()) %>%
  filter(n >= 10, kiirus_median <= 10, kiirus_median >= 3) %>%
  ungroup() %>%
  ggplot(aes(vanus_praegu, kiirus_median, colour = sugu)) +
  geom_point() +
  theme_ipsum_rc() +
  labs(x = "vanus",
       y = "tempo min/km",
       title = "Keskmise jooksu tempo ja vanuse seos",
       subtitle = "Iga punkt tähistab ühe inimese mediaan tempot")
```


Milline on vanuse ja mediaan keskmine jooksu distantsi vaheline seos?
```{r}
# Scatter plot of each runner's median running distance (km) against age,
# coloured by sex. Only runners aged 10 or more, with at least 10 runs that
# have a distance and a median distance of at most 15 km, are shown.
trennid_toodeldud %>%
  ungroup() %>%
  filter(spordiala == "running",
         !is.na(synnipaev),
         !is.na(distants_km),
         !is.na(sugu)) %>%
  # current age in years (days since birth / 365, not rounded)
  mutate(
    vanus_praegu = as.numeric(difftime(today(), synnipaev, units = "days")) / 365
  ) %>%
  group_by(profile, vanus_praegu, sugu) %>%
  summarise(distance_median = median(distants_km, na.rm = TRUE),
            n = n()) %>%
  ungroup() %>%
  filter(n >= 10, vanus_praegu >= 10, distance_median <= 15) %>%
  ggplot(aes(x = vanus_praegu, y = distance_median, colour = sugu)) +
  geom_point() +
  labs(
    title = "Keskmise jooksu distantsi ja vanuse seos",
    subtitle = "Iga punkt tähistab ühe inimese mediaan distantsi",
    x = "vanus",
    y = "distants, km"
  ) +
  theme_ipsum_rc()
```
